#!/bin/bash

# Host and article path prefix (no scheme) of the site being scraped; the
# percent-encoded word is appended to it to form the request URL.
LINK="ru.wiktionary.org/wiki/"


# curl time limits: 10 s to establish the connection, 20 s for the transfer.
TO="--connect-timeout 10 -m 20"
# Extra request headers for curl. The value is a single string with embedded
# double quotes: it is pasted into a generated script (see TMPCMD) so that sh
# re-parses the quoting. Note that it requests gzip/deflate content encoding.
HDR='-H "Connection: keep-alive" -H "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" -H "Accept-Encoding: gzip,deflate" -H "Accept-Charset: windows-1251,utf-8;q=0.7,*;q=0.7" -H "Keep-Alive: 115" -H "Accept-Language: ru,en-us;q=0.7,en;q=0.3"'
# Value for curl -A (User-Agent); the double quotes are part of the value for
# the same reason as in HDR.
AGENT='"Mozilla/5.0 (X11; U; Linux x86_64; ru; rv:1.9.2.12) Gecko/20101027 Ubuntu/10.04 (lucid) Firefox/3.6.12"'
# Value for curl -e (Referer).
HOST='ru.wiktionary.org'
# Input: list of words to look up, one per line.
DICFILE="dic.txt"
# Output: one line of extracted forms is appended here per processed word.
DBFILE="db.txt"
# File the raw HTTP response body is saved to before it is gunzipped.
OUT="out.gz"
# Extra curl flags for debugging; set to "-v" for a verbose trace.
DBG="" #"-v"
# Name of the generated helper script that holds the curl command line.
TMPCMD="cmd.tmp"
# Scratch file for intermediate data (fixed path under /tmp).
TMP="/tmp/mytempfile"
# Functions in this script hand their string result back through this global.
GLOBAL_RET=""

# Percent-encode every byte of a string so it can be appended to a URL.
#
# Arguments: $1 - text to encode; it is treated as raw bytes (e.g. a word
#                 in UTF-8), and every byte is encoded, ASCII included.
# Globals:   GLOBAL_RET (written) - the encoded text, one "%XX" group with
#            upper-case hex digits per input byte; empty for empty input.
# Returns:   always 0.
DoUtf ()
{
	local word="$1" mas

	# printf, not echo: "echo -n" would treat words such as "-e" as options.
	# od -v: without it, repeated 16-byte rows are squeezed into a "*" line
	# and those bytes would be missing from the result.
	# No temporary file is needed, so nothing predictable is written to /tmp.
	mas="$(printf '%s' "$word" | od -A n -v -t x1 | tr -d '[:space:]' |
	       sed -e 's/../%&/g' | tr 'a-f' 'A-F')"
	GLOBAL_RET="$mas"
	return 0
}

# Link targets that Parse searches for in a downloaded page. Each one is the
# /wiki/ path of the percent-encoded (UTF-8) Russian name of a grammatical
# case; the two lines following the first match are taken as that case's data.
# Nominative case.
IPPAT='/wiki/%D0%B8%D0%BC%D0%B5%D0%BD%D0%B8%D1%82%D0%B5%D0%BB%D1%8C%D0%BD%D1%8B%D0%B9'
# Genitive case.
RPPAT='/wiki/%D1%80%D0%BE%D0%B4%D0%B8%D1%82%D0%B5%D0%BB%D1%8C%D0%BD%D1%8B%D0%B9'
# Dative case.
DPPAT='/wiki/%D0%B4%D0%B0%D1%82%D0%B5%D0%BB%D1%8C%D0%BD%D1%8B%D0%B9'
# Accusative case.
VPPAT='/wiki/%D0%B2%D0%B8%D0%BD%D0%B8%D1%82%D0%B5%D0%BB%D1%8C%D0%BD%D1%8B%D0%B9'
# Instrumental case.
TPPAT='/wiki/%D1%82%D0%B2%D0%BE%D1%80%D0%B8%D1%82%D0%B5%D0%BB%D1%8C%D0%BD%D1%8B%D0%B9'
# Prepositional case.
PPPAT='/wiki/%D0%BF%D1%80%D0%B5%D0%B4%D0%BB%D0%BE%D0%B6%D0%BD%D1%8B%D0%B9'

# Print the cell text found on one line of a multi-line fragment.
#
# Arguments: $1 - fragment text, $2 - 1-based number of the line to use
# Outputs:   field 2 of that line when split on ">", then the part before
#            the first "<", then the part before the first ","; a line
#            lacking a delimiter passes through that step unchanged.
_ParseCell ()
{
	printf '%s\n' "$1" | sed -n "$2 p" |
		cut -d ">" -f 2 | cut -d "<" -f 1 | cut -d ',' -f 1
}

# Extract, for each grammatical-case link, the text of the two lines that
# follow its first occurrence in a saved page.
#
# Arguments: $1 - path of the page file (may contain spaces)
# Globals:   IPPAT RPPAT DPPAT VPPAT TPPAT PPPAT (read) - grep patterns
#            GLOBAL_RET (written) - twelve space-separated values: the six
#            first-line values in the pattern order above, then the six
#            second-line values in the same order. Missing values are
#            empty, so the separators are always present.
# Returns:   always 0.
Parse ()
{
	local file="$1"
	local pat ctx first second
	local firsts="" seconds=""

	for pat in "$IPPAT" "$RPPAT" "$DPPAT" "$VPPAT" "$TPPAT" "$PPPAT"; do
		# First match plus two lines of context; keep only the context.
		ctx="$(grep -A 2 -e "$pat" -- "$file" | head -n 3 | tail -n 2)"

		first="$(_ParseCell "$ctx" 1)"
		second="$(_ParseCell "$ctx" 2)"

		# A "*" anywhere in the two lines means the second value is not
		# used; the first value is reported in both positions instead.
		# NOTE(review): presumably the page marks a missing form with "*"
		# - confirm against real pages.
		if [[ "$ctx" == *'*'* ]]; then
			second="$first"
		fi

		firsts+=" $first"
		seconds+=" $second"
	done

	# Drop the single leading separator added by the first iteration.
	GLOBAL_RET="${firsts# }${seconds}"
	return 0
}

# Main driver: for each of the first 99 lines of $DICFILE (stopping early at
# the first empty line) download the word's page, extract its forms with
# Parse and append one record to $DBFILE. Exactly one record is appended per
# processed word, even when the download fails, so records stay aligned with
# the word list.
i=1
# The word list is read once through fd 3 instead of being rescanned with sed
# on every iteration; a dedicated fd keeps commands in the body from
# consuming it through stdin.
while [ "$i" -lt 100 ] && { IFS= read -r line <&3 || [ -n "$line" ]; }; do
	if [ "$line" = "" ]; then
		break
	fi
	DoUtf "$line"
	line="$LINK$GLOBAL_RET"
	echo "line=$line"

	# HDR and AGENT contain their own quoting, so the command is written to
	# a helper script and re-parsed by sh. $line consists only of the fixed
	# prefix and %XX groups, so it cannot break out of the single quotes.
	echo "curl $DBG $TO $HDR -A $AGENT -e $HOST '$line' > $OUT " > "$TMPCMD"
	echo 'exit $?' >> "$TMPCMD"
	chmod +x "$TMPCMD"
	if ! sh "$TMPCMD"; then
		echo "warning: curl failed for $line" >&2
	fi
	# Remove the previous page so gunzip never prompts and stale data is
	# never parsed.
	rm -f out
	if gunzip "$OUT"; then
		Parse out
	else
		echo "warning: no usable page for $line" >&2
		# Parse an empty input so the record keeps its usual shape.
		Parse /dev/null
	fi
	echo "$GLOBAL_RET" >> "$DBFILE"
	i=$((i + 1))
done 3< "$DICFILE"
#rm -f $TMPCMD
